@article{DBLP:journals/pvldb/ZhangR14,
  author    = {Ce Zhang and
               Christopher R{\'{e}}},
  title     = {{DimmWitted}: {A} Study of Main-Memory Statistical Analytics},
  journal   = {Proc. {VLDB} Endow.},
  volume    = {7},
  number    = {12},
  pages     = {1283--1294},
  year      = {2014},
  url       = {http://www.vldb.org/pvldb/vol7/p1283-zhang.pdf},
  doi       = {10.14778/2732977.2733001},
  timestamp = {Sat, 25 Apr 2020 13:59:41 +0200},
  biburl    = {https://dblp.org/rec/journals/pvldb/ZhangR14.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{isca20Centaur,
  author    = {Hwang, Ranggi and Kim, Taehun and Kwon, Youngeun and Rhu, Minsoo},
  title     = {Centaur: {A} Chiplet-based, Hybrid Sparse-Dense Accelerator
               for Personalized Recommendations},
  booktitle = {{ISCA} 2020},
  year      = {2020}
}

% Unpublished submission: there is no journal, so the venue is recorded in `note`.
% The author is double-braced so it is treated as one literal name, not First/Last.
@unpublished{parax,
  author = {{Anonymous Authors}},
  title  = {{ParaX}: Boosting Deep Learning on Many-Core {CPUs}},
  note   = {Submitted to {ASPLOS}},
  year   = {2020},
}

@inproceedings{isca20RecNMP,
  author    = {Liu Ke and
               Udit Gupta and
               Carole{-}Jean Wu and
               Benjamin Youngjae Cho and
               Mark Hempstead and
               Brandon Reagen and
               Xuan Zhang and
               David M. Brooks and
               Vikas Chandra and
               Utku Diril and
               Amin Firoozshahian and
               Kim M. Hazelwood and
               Bill Jia and
               Hsien{-}Hsin S. Lee and
               Meng Li and
               Bert Maher and
               Dheevatsa Mudigere and
               Maxim Naumov and
               Martin Schatz and
               Mikhail Smelyanskiy and
               Xiaodong Wang},
  title     = {{RecNMP}: Accelerating Personalized Recommendation with Near-Memory
               Processing},
  booktitle = {{ISCA} 2020},
  year      = {2020},
}



@inproceedings{isca20DeepRecSys,
  author    = {Udit Gupta and
               Samuel Hsia and
               Vikram Saraph and
               Xiaodong Wang and
               Brandon Reagen and
               Gu{-}Yeon Wei and
               Hsien{-}Hsin S. Lee and
               David Brooks and
               Carole{-}Jean Wu},
  title     = {{DeepRecSys}: {A} System for Optimizing End-To-End At-scale Neural Recommendation
               Inference},
  booktitle = {{ISCA} 2020},
  year      = {2020},
}

@inproceedings{DBLP:conf/hpca/GuptaWWNR0CHHJL20,
  author    = {Udit Gupta and
               Carole{-}Jean Wu and
               Xiaodong Wang and
               Maxim Naumov and
               Brandon Reagen and
               David Brooks and
               Bradford Cottel and
               Kim M. Hazelwood and
               Mark Hempstead and
               Bill Jia and
               Hsien{-}Hsin S. Lee and
               Andrey Malevich and
               Dheevatsa Mudigere and
               Mikhail Smelyanskiy and
               Liang Xiong and
               Xuan Zhang},
  title     = {The Architectural Implications of {Facebook}'s {DNN}-Based Personalized
               Recommendation},
  booktitle = {{IEEE} International Symposium on High Performance Computer Architecture,
               {HPCA} 2020, San Diego, CA, USA, February 22-26, 2020},
  pages     = {488--501},
  publisher = {{IEEE}},
  year      = {2020},
  url       = {https://doi.org/10.1109/HPCA47549.2020.00047},
  doi       = {10.1109/HPCA47549.2020.00047},
  timestamp = {Wed, 29 Apr 2020 10:53:08 +0200},
  biburl    = {https://dblp.org/rec/conf/hpca/GuptaWWNR0CHHJL20.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/micro/KwonLR19,
  author    = {Youngeun Kwon and
               Yunjae Lee and
               Minsoo Rhu},
  title     = {{TensorDIMM}: {A} Practical Near-Memory Processing Architecture for
               Embeddings and Tensor Operations in Deep Learning},
  booktitle = {Proceedings of the 52nd Annual {IEEE/ACM} International Symposium
               on Microarchitecture, {MICRO} 2019, Columbus, OH, USA, October 12-16,
               2019},
  pages     = {740--753},
  publisher = {{ACM}},
  year      = {2019},
  url       = {https://doi.org/10.1145/3352460.3358284},
  doi       = {10.1145/3352460.3358284},
  timestamp = {Wed, 16 Oct 2019 10:12:02 +0200},
  biburl    = {https://dblp.org/rec/conf/micro/KwonLR19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/asplos/ChenYMT16,
  author    = {Quan Chen and
               Hailong Yang and
               Jason Mars and
               Lingjia Tang},
  editor    = {Tom Conte and
               Yuanyuan Zhou},
  title     = {Baymax: {QoS} Awareness and Increased Utilization for Non-Preemptive
               Accelerators in Warehouse Scale Computers},
  booktitle = {Proceedings of the Twenty-First International Conference on Architectural
               Support for Programming Languages and Operating Systems, {ASPLOS}
               '16, Atlanta, GA, USA, April 2-6, 2016},
  pages     = {681--696},
  publisher = {{ACM}},
  year      = {2016},
  url       = {https://doi.org/10.1145/2872362.2872368},
  doi       = {10.1145/2872362.2872368},
  timestamp = {Tue, 03 Sep 2019 08:30:33 +0200},
  biburl    = {https://dblp.org/rec/conf/asplos/ChenYMT16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/asplos/ChenYGKMT17,
  author    = {Quan Chen and
               Hailong Yang and
               Minyi Guo and
               Ram Srivatsa Kannan and
               Jason Mars and
               Lingjia Tang},
  title     = {Prophet: Precise {QoS} Prediction on Non-Preemptive Accelerators to
               Improve Utilization in Warehouse-Scale Computers},
  booktitle = {Proceedings of the Twenty-Second International Conference on Architectural
               Support for Programming Languages and Operating Systems, {ASPLOS}
               2017, Xi'an, China, April 8-12, 2017},
  pages     = {17--32},
  publisher = {{ACM}},
  year      = {2017},
}

% Title quotes use TeX opening/closing quote ligatures; straight double quotes
% typeset as two closing quotes.
@inproceedings{DBLP:conf/iscas/Olsen18,
  author    = {Eric B. Olsen},
  title     = {{RNS} Hardware Matrix Multiplier for High Precision Neural Network
               Acceleration: {``RNS TPU''}},
  booktitle = {{IEEE} International Symposium on Circuits and Systems, {ISCAS} 2018,
               27-30 May 2018, Florence, Italy},
  pages     = {1--5},
  publisher = {{IEEE}},
  year      = {2018},
  url       = {https://doi.org/10.1109/ISCAS.2018.8351352},
  doi       = {10.1109/ISCAS.2018.8351352},
  timestamp = {Wed, 16 Oct 2019 14:14:49 +0200},
  biburl    = {https://dblp.org/rec/conf/iscas/Olsen18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/micro/FowersOPMLLAHAG19,
  author    = {Jeremy Fowers and
               Kalin Ovtcharov and
               Michael K. Papamichael and
               Todd Massengill and
               Ming Liu and
               Daniel Lo and
               Shlomi Alkalay and
               Michael Haselman and
               Logan Adams and
               Mahdi Ghandi and
               Stephen Heil and
               Prerak Patel and
               Adam Sapek and
               Gabriel Weisz and
               Lisa Woods and
               Sitaram Lanka and
               Steven K. Reinhardt and
               Adrian M. Caulfield and
               Eric S. Chung and
               Doug Burger},
  title     = {Inside {Project} {Brainwave}'s Cloud-Scale, Real-Time {AI} Processor},
  journal   = {{IEEE} Micro},
  volume    = {39},
  number    = {3},
  pages     = {20--28},
  year      = {2019},
  url       = {https://doi.org/10.1109/MM.2019.2910506},
  doi       = {10.1109/MM.2019.2910506},
  timestamp = {Thu, 16 May 2019 09:14:54 +0200},
  biburl    = {https://dblp.org/rec/journals/micro/FowersOPMLLAHAG19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/opml/ZhangRWZRRLWH19,
  author    = {Minjia Zhang and
               Samyam Rajbhandari and
               Wenhan Wang and
               Elton Zheng and
               Olatunji Ruwase and
               Jeff Rasley and
               Jason Li and
               Junhua Wang and
               Yuxiong He},
  editor    = {Bharath Ramsundar and
               Nisha Talagala},
  title     = {Accelerating Large Scale Deep Learning Inference through {DeepCPU} at
               {Microsoft}},
  booktitle = {2019 {USENIX} Conference on Operational Machine Learning, OpML 2019,
               Santa Clara, CA, USA, May 20, 2019},
  pages     = {5--7},
  publisher = {{USENIX} Association},
  year      = {2019},
  url       = {https://www.usenix.org/conference/opml19/presentation/zhang-minjia},
  timestamp = {Mon, 27 May 2019 12:19:54 +0200},
  biburl    = {https://dblp.org/rec/conf/opml/ZhangRWZRRLWH19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:conf/kdd/LiZCS14,
  author    = {Li, Mu and Zhang, Tong and Chen, Yuqiang and Smola, Alexander J.},
  editor    = {Macskassy, Sofus A. and Perlich, Claudia and Leskovec, Jure and
               Wang, Wei and Ghani, Rayid},
  title     = {Efficient mini-batch training for stochastic optimization},
  booktitle = {The 20th {ACM} {SIGKDD} International Conference on Knowledge
               Discovery and Data Mining, {KDD} '14, New York, NY, {USA} -
               August 24 - 27, 2014},
  publisher = {{ACM}},
  year      = {2014},
  pages     = {661--670},
  doi       = {10.1145/2623330.2623612},
  url       = {https://doi.org/10.1145/2623330.2623612},
  timestamp = {Tue, 06 Nov 2018 16:59:35 +0100},
  biburl    = {https://dblp.org/rec/conf/kdd/LiZCS14.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/WuSCLNMKCGMKSJL16,
  author    = {Yonghui Wu and
               Mike Schuster and
               Zhifeng Chen and
               Quoc V. Le and
               Mohammad Norouzi and
               Wolfgang Macherey and
               Maxim Krikun and
               Yuan Cao and
               Qin Gao and
               Klaus Macherey and
               Jeff Klingner and
               Apurva Shah and
               Melvin Johnson and
               Xiaobing Liu and
               Lukasz Kaiser and
               Stephan Gouws and
               Yoshikiyo Kato and
               Taku Kudo and
               Hideto Kazawa and
               Keith Stevens and
               George Kurian and
               Nishant Patil and
               Wei Wang and
               Cliff Young and
               Jason Smith and
               Jason Riesa and
               Alex Rudnick and
               Oriol Vinyals and
               Greg Corrado and
               Macduff Hughes and
               Jeffrey Dean},
  title     = {{Google}'s Neural Machine Translation System: Bridging the Gap between
               Human and Machine Translation},
  journal   = {CoRR},
  volume    = {abs/1609.08144},
  year      = {2016},
  url       = {http://arxiv.org/abs/1609.08144},
  archivePrefix = {arXiv},
  eprint    = {1609.08144},
  timestamp = {Thu, 14 Mar 2019 09:34:18 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/WuSCLNMKCGMKSJL16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/ZhengMWCYML16,
  author        = {Zheng, Shuxin and Meng, Qi and Wang, Taifeng and Chen, Wei and
                   Yu, Nenghai and Ma, Zhiming and Liu, Tie{-}Yan},
  title         = {Asynchronous Stochastic Gradient Descent with Delay
                   Compensation for Distributed Deep Learning},
  journal       = {CoRR},
  volume        = {abs/1609.08326},
  year          = {2016},
  archivePrefix = {arXiv},
  eprint        = {1609.08326},
  url           = {http://arxiv.org/abs/1609.08326},
  timestamp     = {Mon, 13 Aug 2018 16:46:27 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/ZhengMWCYML16.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}

% Fourth author restored: the DBLP key (ChenMBJ16) encodes four author initials,
% but only three authors were listed.
% NOTE(review): confirm the spelling of the restored name against the arXiv record.
@article{DBLP:journals/corr/ChenMBJ16,
  author    = {Jianmin Chen and
               Rajat Monga and
               Samy Bengio and
               Rafal J{\'{o}}zefowicz},
  title     = {Revisiting Distributed Synchronous {SGD}},
  journal   = {CoRR},
  volume    = {abs/1604.00981},
  year      = {2016},
  url       = {http://arxiv.org/abs/1604.00981},
  archivePrefix = {arXiv},
  eprint    = {1604.00981},
  timestamp = {Mon, 13 Aug 2018 16:48:43 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/ChenMBJ16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{DBLP:slide,
  author    = {Beidi Chen and
               Tharun Medini and
               James Farwell and
               Sameh Gobriel and
               Charlie Tai and
               Anshumali Shrivastava},
  title     = {{SLIDE}: In Defense of Smart Algorithms over Hardware Acceleration
               for Large-Scale Deep Learning Systems},
  booktitle = {{MLSys}},
  pages     = {1--16},
  year      = {2020},
}

@inproceedings{zaharia2010delay,
  author    = {Zaharia, Matei and
               Borthakur, Dhruba and
               Sen Sarma, Joydeep and
               Elmeleegy, Khaled and
               Shenker, Scott and
               Stoica, Ion},
  title     = {Delay scheduling: a simple technique for achieving locality and
               fairness in cluster scheduling},
  booktitle = {Proceedings of the 5th European conference on Computer systems},
  year      = {2010},
  pages     = {265--278}
}

@inproceedings{megdet,
  author    = {Peng, Chao and Xiao, Tete and Li, Zeming and Jiang, Yuning and Zhang, Xiangyu and Jia, Kai and Yu, Gang and Sun, Jian},
  title     = {{MegDet}: A large mini-batch object detector},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {6181--6189},
  year      = {2018}
}

@inproceedings{cnn1,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
  title     = {{ImageNet} classification with deep convolutional neural networks},
  booktitle = {Advances in neural information processing systems},
  pages     = {1097--1105},
  year      = {2012}
}

% NOTE(review): this is the same work as the entry keyed `sgd` further down in
% this file. Both keys are kept so existing citations resolve; keep the two
% records in sync, or cite a single key.
@incollection{bottou2010large,
  title={Large-scale machine learning with stochastic gradient descent},
  author={Bottou, L{\'e}on},
  booktitle={Proceedings of COMPSTAT'2010},
  pages={177--186},
  year={2010},
  publisher={Springer}
}

@inproceedings{cnn2,
  author    = {Szegedy, Christian and
               Liu, Wei and
               Jia, Yangqing and
               Sermanet, Pierre and
               Reed, Scott and
               Anguelov, Dragomir and
               Erhan, Dumitru and
               Vanhoucke, Vincent and
               Rabinovich, Andrew},
  title     = {Going deeper with convolutions},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2015},
  pages     = {1--9}
}

% arXiv preprint: the identifier is stored in the eprint fields, following the
% CoRR convention of the DBLP-exported entries in this file.
@article{gap1,
  author        = {Keskar, Nitish Shirish and Mudigere, Dheevatsa and Nocedal, Jorge and Smelyanskiy, Mikhail and Tang, Ping Tak Peter},
  title         = {On large-batch training for deep learning: Generalization gap and sharp minima},
  journal       = {CoRR},
  volume        = {abs/1609.04836},
  year          = {2016},
  url           = {http://arxiv.org/abs/1609.04836},
  archivePrefix = {arXiv},
  eprint        = {1609.04836}
}


% Author list completed: the export stopped after ten names.
% NOTE(review): confirm the last two authors against the OSDI '18 proceedings.
@inproceedings{tvm,
  author    = {Chen, Tianqi and Moreau, Thierry and Jiang, Ziheng and Zheng, Lianmin and Yan, Eddie and Shen, Haichen and Cowan, Meghan and Wang, Leyuan and Hu, Yuwei and Ceze, Luis and Guestrin, Carlos and Krishnamurthy, Arvind},
  title     = {{TVM}: An automated end-to-end optimizing compiler for deep learning},
  booktitle = {13th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 18)},
  pages     = {578--594},
  year      = {2018}
}

% arXiv preprint, not a proceedings paper: recorded as an article with eprint
% fields, following the CoRR convention of the DBLP-exported entries in this file.
@article{large1,
  author        = {Goyal, Priya and Doll{\'a}r, Piotr and Girshick, Ross and Noordhuis, Pieter and Wesolowski, Lukasz and Kyrola, Aapo and Tulloch, Andrew and Jia, Yangqing and He, Kaiming},
  title         = {Accurate, large minibatch {SGD}: Training {ImageNet} in 1 hour},
  journal       = {CoRR},
  volume        = {abs/1706.02677},
  year          = {2017},
  url           = {http://arxiv.org/abs/1706.02677},
  archivePrefix = {arXiv},
  eprint        = {1706.02677}
}

% volume follows the abs/<arXiv id> form used by the other CoRR entries in this
% file (it was "1"); eprint added to match archivePrefix.
@article{DBLP:journals/corr/abs-1802-05799,
  author    = {Alexander Sergeev and
               Mike Del Balso},
  title     = {Horovod: fast and easy distributed deep learning in {TensorFlow}},
  journal   = {CoRR},
  volume    = {abs/1802.05799},
  year      = {2018},
  url       = {http://arxiv.org/abs/1802.05799},
  archivePrefix = {arXiv},
  eprint    = {1802.05799},
  timestamp = {Mon, 13 Aug 2018 16:46:12 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1802-05799},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/cacm/KrizhevskySH17,
  author    = {Alex Krizhevsky and
               Ilya Sutskever and
               Geoffrey E. Hinton},
  title     = {{ImageNet} classification with deep convolutional neural networks},
  journal   = {Commun. {ACM}},
  volume    = {60},
  number    = {6},
  pages     = {84--90},
  year      = {2017},
  url       = {http://doi.acm.org/10.1145/3065386},
  doi       = {10.1145/3065386},
  timestamp = {Sun, 02 Jun 2019 20:48:58 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/cacm/KrizhevskySH17},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{large2,
  author       = {You, Yang and Zhang, Zhao and Hsieh, Cho-Jui and Demmel, James and Keutzer, Kurt},
  title        = {{ImageNet} training in minutes},
  booktitle    = {Proceedings of the 47th International Conference on Parallel Processing},
  pages        = {1},
  year         = {2018},
  organization = {ACM}
}

@inproceedings{ps,
  author    = {Li, Mu and
               Andersen, David G and
               Park, Jun Woo and
               Smola, Alexander J and
               Ahmed, Amr and
               Josifovski, Vanja and
               Long, James and
               Shekita, Eugene J and
               Su, Bor-Yiing},
  title     = {Scaling distributed machine learning with the parameter server},
  booktitle = {11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14)},
  year      = {2014},
  pages     = {583--598}
}

@inproceedings{asy1,
  author       = {Mitliagkas, Ioannis and
                  Zhang, Ce and
                  Hadjis, Stefan and
                  R{\'e}, Christopher},
  title        = {Asynchrony begets momentum, with an application to deep learning},
  booktitle    = {2016 54th Annual Allerton Conference on Communication, Control, and Computing (Allerton)},
  organization = {IEEE},
  year         = {2016},
  pages        = {997--1004}
}

% Author spelled with the accent, as for the same person elsewhere in this file.
@inproceedings{asy2,
  author    = {Recht, Benjamin and R{\'e}, Christopher and Wright, Stephen and Niu, Feng},
  title     = {Hogwild: A lock-free approach to parallelizing stochastic gradient descent},
  booktitle = {Advances in neural information processing systems},
  pages     = {693--701},
  year      = {2011}
}

% NOTE(review): the crossref target DBLP:conf/hpca/2018 is not defined in the
% part of the file reviewed here; confirm it exists (and, for classic BibTeX,
% that it appears after this entry), otherwise BibTeX reports a bad cross reference.
@inproceedings{DBLP:conf/hpca/HazelwoodBBCDDF18,
  author    = {Kim M. Hazelwood and
               Sarah Bird and
               David M. Brooks and
               Soumith Chintala and
               Utku Diril and
               Dmytro Dzhulgakov and
               Mohamed Fawzy and
               Bill Jia and
               Yangqing Jia and
               Aditya Kalro and
               James Law and
               Kevin Lee and
               Jason Lu and
               Pieter Noordhuis and
               Misha Smelyanskiy and
               Liang Xiong and
               Xiaodong Wang},
  title     = {Applied Machine Learning at {Facebook}: {A} Datacenter Infrastructure
               Perspective},
  booktitle = {{IEEE} International Symposium on High Performance Computer Architecture,
               {HPCA} 2018, Vienna, Austria, February 24-28, 2018},
  pages     = {620--629},
  year      = {2018},
  crossref  = {DBLP:conf/hpca/2018},
  url       = {https://doi.org/10.1109/HPCA.2018.00059},
  doi       = {10.1109/HPCA.2018.00059},
}

% CoRR preprint: recorded as an article (it had a journal but no booktitle), and
% the placeholder page range 1--1 is replaced by the arXiv identifier.
% NOTE(review): arXiv identifier added from the public arXiv record; verify.
@article{mxnet,
  author        = {Chen, Tianqi and Li, Mu and Li, Yutian and Lin, Min and Wang, Naiyan and Wang, Minjie and Xiao, Tianjun and Xu, Bing and Zhang, Chiyuan and Zhang, Zheng},
  title         = {{MXNet}: A flexible and efficient machine learning library for heterogeneous distributed systems},
  journal       = {CoRR},
  volume        = {abs/1512.01274},
  year          = {2015},
  url           = {http://arxiv.org/abs/1512.01274},
  archivePrefix = {arXiv},
  eprint        = {1512.01274}
}

@inproceedings{datap2,
  author    = {Dean, Jeffrey and
               Corrado, Greg and
               Monga, Rajat and
               Chen, Kai and
               Devin, Matthieu and
               Mao, Mark and
               Senior, Andrew and
               Tucker, Paul and
               Yang, Ke and
               Le, Quoc V and
               others},
  title     = {Large scale distributed deep networks},
  booktitle = {Advances in neural information processing systems},
  year      = {2012},
  pages     = {1223--1231}
}

% Last author corrected: given and family name were swapped ("Andrew, Ng").
@inproceedings{modelp,
  author    = {Coates, Adam and Huval, Brody and Wang, Tao and Wu, David and Catanzaro, Bryan and Ng, Andrew},
  title     = {Deep learning with {COTS} {HPC} systems},
  booktitle = {International conference on machine learning},
  pages     = {1337--1345},
  year      = {2013}
}

@article{mobilenet,
  author        = {Howard, Andrew G and Zhu, Menglong and Chen, Bo and Kalenichenko, Dmitry and Wang, Weijun and Weyand, Tobias and Andreetto, Marco and Adam, Hartwig},
  title         = {{MobileNets}: Efficient convolutional neural networks for mobile vision applications},
  journal       = {CoRR},
  volume        = {abs/1704.04861},
  year          = {2017},
  url           = {http://arxiv.org/abs/1704.04861},
  archivePrefix = {arXiv},
  eprint        = {1704.04861}
}

% NOTE(review): this is the same work as the entry keyed `bottou2010large`
% earlier in this file. Both keys are kept so existing citations resolve; keep
% the two records in sync, or cite a single key.
@incollection{sgd,
  title={Large-scale machine learning with stochastic gradient descent},
  author={Bottou, L{\'e}on},
  booktitle={Proceedings of COMPSTAT'2010},
  pages={177--186},
  year={2010},
  publisher={Springer}
}

@inproceedings{resnet,
  author    = {He, Kaiming and
               Zhang, Xiangyu and
               Ren, Shaoqing and
               Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@article{multicores1,
  author    = {Hill, Mark D and
               Marty, Michael R},
  title     = {Amdahl's law in the multicore era},
  journal   = {Computer},
  publisher = {IEEE},
  year      = {2008},
  volume    = {41},
  number    = {7},
  pages     = {33--38}
}

@inproceedings{multicores2,
  author    = {Chu, Cheng-Tao and
               Kim, Sang K and
               Lin, Yi-An and
               Yu, YuanYuan and
               Bradski, Gary and
               Olukotun, Kunle and
               Ng, Andrew Y},
  title     = {Map-reduce for machine learning on multicore},
  booktitle = {Advances in neural information processing systems},
  year      = {2007},
  pages     = {281--288}
}

@inproceedings{imagenet,
  author       = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
  title        = {{ImageNet}: A large-scale hierarchical image database},
  booktitle    = {2009 IEEE conference on computer vision and pattern recognition},
  pages        = {248--255},
  year         = {2009},
  organization = {IEEE}
}

% Product name capitalisation restored and brace-protected against style recasing.
@misc{mkldnn,
  title        = {{Intel(R)} Math Kernel Library for Deep Neural Networks ({Intel(R)} {MKL-DNN})},
  howpublished = {\url{https://github.com/intel/mkl-dnn}}
}

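% Disabled legacy entry. The leading at-sign is removed on purpose: classic
% BibTeX has no comment character, so an entry that follows a percent sign is
% still parsed (here it would be a syntax error plus a repeated key).
% article{mkldnn,
%   title={https://github.com/intel/mkl-dnn}
% }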

% booktitle: the math-mode brace escapes printed literal braces around USENIX and
% OSDI; plain protective braces are used instead.
@inproceedings{tf,
  author    = {Abadi, Mart{\'\i}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and others},
  title     = {{TensorFlow}: A system for large-scale machine learning},
  booktitle = {12th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 16)},
  pages     = {265--283},
  year      = {2016}
}

@inproceedings{caffe,
  author       = {Jia, Yangqing and
                  Shelhamer, Evan and
                  Donahue, Jeff and
                  Karayev, Sergey and
                  Long, Jonathan and
                  Girshick, Ross and
                  Guadarrama, Sergio and
                  Darrell, Trevor},
  title        = {Caffe: Convolutional architecture for fast feature embedding},
  booktitle    = {Proceedings of the 22nd ACM international conference on Multimedia},
  organization = {ACM},
  year         = {2014},
  pages        = {675--678}
}

@article{lstm,
  author    = {Hochreiter, Sepp and
               Schmidhuber, J{\"u}rgen},
  title     = {Long short-term memory},
  journal   = {Neural computation},
  publisher = {MIT Press},
  year      = {1997},
  volume    = {9},
  number    = {8},
  pages     = {1735--1780}
}

@article{cnn3,
  author        = {Simonyan, Karen and Zisserman, Andrew},
  title         = {Very deep convolutional networks for large-scale image recognition},
  journal       = {CoRR},
  volume        = {abs/1409.1556},
  year          = {2014},
  url           = {http://arxiv.org/abs/1409.1556},
  archivePrefix = {arXiv},
  eprint        = {1409.1556}
}

% Duplicate first author removed (Manning was listed twice).
@book{nlp1,
  author    = {Manning, Christopher D and Sch{\"u}tze, Hinrich},
  title     = {Foundations of statistical natural language processing},
  year      = {1999},
  publisher = {MIT Press}
}

@inproceedings{nlp2,
  author       = {Graves, Alex and
                  Mohamed, Abdel-rahman and
                  Hinton, Geoffrey},
  title        = {Speech recognition with deep recurrent neural networks},
  booktitle    = {2013 IEEE international conference on acoustics, speech and signal processing},
  organization = {IEEE},
  year         = {2013},
  pages        = {6645--6649}
}

% Final letter of the fourth author's surname takes an acute accent, not a grave.
@inproceedings{nlp3,
  author    = {Mikolov, Tom{\'a}{\v{s}} and Karafi{\'a}t, Martin and Burget, Luk{\'a}{\v{s}} and {\v{C}}ernock{\'y}, Jan and Khudanpur, Sanjeev},
  title     = {Recurrent neural network based language model},
  booktitle = {Eleventh annual conference of the international speech communication association},
  year      = {2010}
}

@article{nlp4,
  author    = {Williams, Ronald J and
               Zipser, David},
  title     = {A learning algorithm for continually running fully recurrent
               neural networks},
  journal   = {Neural computation},
  publisher = {MIT Press},
  year      = {1989},
  volume    = {1},
  number    = {2},
  pages     = {270--280}
}

@article{recommender1,
  author    = {Herlocker, Jonathan L and
               Konstan, Joseph A and
               Terveen, Loren G and
               Riedl, John T},
  title     = {Evaluating collaborative filtering recommender systems},
  journal   = {ACM Transactions on Information Systems (TOIS)},
  publisher = {ACM},
  year      = {2004},
  volume    = {22},
  number    = {1},
  pages     = {5--53}
}

@inproceedings{recommender3,
  author       = {Covington, Paul and Adams, Jay and Sargin, Emre},
  title        = {Deep neural networks for {YouTube} recommendations},
  booktitle    = {Proceedings of the 10th ACM conference on recommender systems},
  pages        = {191--198},
  year         = {2016},
  organization = {ACM}
}

@inproceedings{ncf,
  author       = {He, Xiangnan and
                  Liao, Lizi and
                  Zhang, Hanwang and
                  Nie, Liqiang and
                  Hu, Xia and
                  Chua, Tat-Seng},
  title        = {Neural collaborative filtering},
  booktitle    = {Proceedings of the 26th international conference on world wide web},
  organization = {International World Wide Web Conferences Steering Committee},
  year         = {2017},
  pages        = {173--182}
}

% CoRR preprint: recorded as an article (it had a journal but no booktitle).
% NOTE(review): arXiv identifier added from the public arXiv record; verify.
@article{cudnn,
  author        = {Chetlur, Sharan and Woolley, Cliff and Vandermersch, Philippe and Cohen, Jonathan and Tran, John and Catanzaro, Bryan and Shelhamer, Evan},
  title         = {{cuDNN}: Efficient primitives for deep learning},
  journal       = {CoRR},
  volume        = {abs/1410.0759},
  year          = {2014},
  url           = {http://arxiv.org/abs/1410.0759},
  archivePrefix = {arXiv},
  eprint        = {1410.0759}
}

@article{memorybound,
  author    = {Abadi, Martin and
               Burrows, Mike and
               Manasse, Mark and
               Wobber, Ted},
  title     = {Moderately hard, memory-bound functions},
  journal   = {ACM Transactions on Internet Technology (TOIT)},
  publisher = {ACM},
  year      = {2005},
  volume    = {5},
  number    = {2},
  pages     = {299--327}
}

@book{openmp,
  author    = {Chandra, Rohit and Dagum, Leo and Kohr, David and Menon, Ramesh and Maydan, Dror and McDonald, Jeff},
  title     = {Parallel programming in {OpenMP}},
  year      = {2001},
  publisher = {Morgan Kaufmann}
}

@book{openmp2,
  author    = {Chapman, Barbara and Jost, Gabriele and Van Der Pas, Ruud},
  title     = {Using {OpenMP}: portable shared memory parallel programming},
  volume    = {10},
  year      = {2008},
  publisher = {MIT Press}
}

@inproceedings{sync1,
  author       = {Seide, Frank and Fu, Hao and Droppo, Jasha and Li, Gang and Yu, Dong},
  title        = {On parallelizability of stochastic gradient descent for speech {DNNs}},
  booktitle    = {2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {235--239},
  year         = {2014},
  organization = {IEEE}
}

@inproceedings{sync2,
  author    = {Zinkevich, Martin and
               Weimer, Markus and
               Li, Lihong and
               Smola, Alex J},
  title     = {Parallelized stochastic gradient descent},
  booktitle = {Advances in neural information processing systems},
  year      = {2010},
  pages     = {2595--2603}
}

% A patent, not a proceedings paper: "Google Patents" is where the record was
% found, so it moves from booktitle to howpublished.
% NOTE(review): add the patent number and issuing office once confirmed.
@misc{numa,
  author       = {Kimmel, Jeffrey S and Alfieri, Robert A and Miles, A and McGrath, William K and McLeod, Michael J and O'Connell, Mark A and Simpson, Guy A},
  title        = {Operating system for a non-uniform memory access multiprocessor system},
  howpublished = {Patent, retrieved from Google Patents},
  year         = {2000},
}

% Title repaired: the separator between "chips" and "a" was lost in export.
@inproceedings{core-memory,
  author       = {Borkar, Shekhar},
  title        = {Thousand core chips: a technology perspective},
  booktitle    = {2007 44th ACM/IEEE Design Automation Conference},
  pages        = {746--749},
  year         = {2007},
  organization = {IEEE}
}

@inproceedings{mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted residuals and linear bottlenecks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {4510--4520},
  year      = {2018}
}

@article{linear,
  author        = {Krizhevsky, Alex},
  title         = {One weird trick for parallelizing convolutional neural networks},
  journal       = {CoRR},
  volume        = {abs/1404.5997},
  year          = {2014},
  url           = {http://arxiv.org/abs/1404.5997},
  archivePrefix = {arXiv},
  eprint        = {1404.5997}
}

@inproceedings{gemm-size,
  author       = {Li, Yinan and Dongarra, Jack and Tomov, Stanimire},
  title        = {A note on auto-tuning {GEMM} for {GPUs}},
  booktitle    = {International Conference on Computational Science},
  pages        = {884--892},
  year         = {2009},
  organization = {Springer}
}

% Journal name capitalised to match the `ddr4` entry in this file.
@article{memwall,
  author    = {Wulf, Wm A and McKee, Sally A},
  title     = {Hitting the memory wall: implications of the obvious},
  journal   = {ACM SIGARCH Computer Architecture News},
  volume    = {23},
  number    = {1},
  pages     = {20--24},
  year      = {1995},
  publisher = {ACM}
}

@inproceedings{int8,
  author    = {Jacob, Benoit and
               Kligys, Skirmantas and
               Chen, Bo and
               Zhu, Menglong and
               Tang, Matthew and
               Howard, Andrew and
               Adam, Hartwig and
               Kalenichenko, Dmitry},
  title     = {Quantization and training of neural networks for efficient
               integer-arithmetic-only inference},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2018},
  pages     = {2704--2713}
}

@inproceedings{xnornet,
  author       = {Rastegari, Mohammad and Ordonez, Vicente and Redmon, Joseph and Farhadi, Ali},
  title        = {{XNOR-Net}: {ImageNet} classification using binary convolutional neural networks},
  booktitle    = {European Conference on Computer Vision},
  pages        = {525--542},
  year         = {2016},
  organization = {Springer}
}

@article{twn,
  author        = {Li, Fengfu and Zhang, Bo and Liu, Bin},
  title         = {Ternary weight networks},
  journal       = {CoRR},
  volume        = {abs/1605.04711},
  year          = {2016},
  url           = {http://arxiv.org/abs/1605.04711},
  archivePrefix = {arXiv},
  eprint        = {1605.04711}
}

@inproceedings{bwn,
  author    = {Courbariaux, Matthieu and Bengio, Yoshua and David, Jean-Pierre},
  title     = {{BinaryConnect}: Training deep neural networks with binary weights during propagations},
  booktitle = {Advances in neural information processing systems},
  pages     = {3123--3131},
  year      = {2015}
}

@article{glow,
  author        = {Rotem, Nadav and Fix, Jordan and Abdulrasool, Saleem and Catron, Garret and Deng, Summer and Dzhabarov, Roman and Gibson, Nick and Hegeman, James and Lele, Meghan and Levenstein, Roman and others},
  title         = {Glow: Graph lowering compiler techniques for neural networks},
  journal       = {CoRR},
  volume        = {abs/1805.00907},
  year          = {2018},
  url           = {http://arxiv.org/abs/1805.00907},
  archivePrefix = {arXiv},
  eprint        = {1805.00907}
}

@article{tensorcom,
  author        = {Vasilache, Nicolas and Zinenko, Oleksandr and Theodoridis, Theodoros and Goyal, Priya and DeVito, Zachary and Moses, William S and Verdoolaege, Sven and Adams, Andrew and Cohen, Albert},
  title         = {Tensor comprehensions: Framework-agnostic high-performance machine learning abstractions},
  journal       = {CoRR},
  volume        = {abs/1802.04730},
  year          = {2018},
  url           = {http://arxiv.org/abs/1802.04730},
  archivePrefix = {arXiv},
  eprint        = {1802.04730}
}

@article{gemmsize2,
  author    = {Dongarra, Jack and Hammarling, Sven and Higham, Nicholas J and Relton, Samuel D and Valero-Lara, Pedro and Zounon, Mawussi},
  title     = {The design and performance of batched {BLAS} on modern high-performance computing systems},
  journal   = {Procedia Computer Science},
  volume    = {108},
  pages     = {495--504},
  year      = {2017},
  publisher = {Elsevier}
}

@inproceedings{gemmsize3,
  author       = {Shi, Yang and Niranjan, Uma Naresh and Anandkumar, Animashree and Cecka, Cris},
  title        = {Tensor contractions with extended {BLAS} kernels on {CPU} and {GPU}},
  booktitle    = {2016 IEEE 23rd International Conference on High Performance Computing (HiPC)},
  pages        = {193--202},
  year         = {2016},
  organization = {IEEE}
}

% Title apostrophe is plain ASCII: the typographic apostrophe is not safe under
% 8-bit classic BibTeX.
@inproceedings{WMT,
  author    = {Durrani, Nadir and Haddow, Barry and Koehn, Philipp and Heafield, Kenneth},
  title     = {{Edinburgh}'s phrase-based machine translation systems for {WMT-14}},
  booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation},
  pages     = {97--104},
  year      = {2014}
}

% Journal name capitalised (it was entered as lower-case "computer").
@article{sharedmem,
  author    = {Adve, Sarita V and Gharachorloo, Kourosh},
  title     = {Shared memory consistency models: A tutorial},
  journal   = {Computer},
  volume    = {29},
  number    = {12},
  pages     = {66--76},
  year      = {1996},
  publisher = {IEEE},
}

@article{blocks,
  author  = {Georganas, Evangelos and
             Banerjee, Kunal and
             Kalamkar, Dhiraj and
             Avancha, Sasikanth and
             Venkat, Anand and
             Anderson, Michael and
             Henry, Greg and
             Pabst, Hans and
             Heinecke, Alexander},
  title   = {High-Performance Deep Learning via a Single Building Block},
  journal = {arXiv preprint arXiv:1906.06440},
  year    = {2019},
}

% Web resource: typed as misc with the link in howpublished, because the
% original article entry carried only a bare URL as its title.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{movielens,
  title        = {{MovieLens} {20M} Dataset},
  howpublished = {\url{https://grouplens.org/datasets/movielens/20m/}},
}

% Capitalisation of GPU and ConvNets restored and brace-protected.
@article{yadan2013multi,
  author  = {Yadan, Omry and Adams, Keith and Taigman, Yaniv and Ranzato, Marc'Aurelio},
  title   = {Multi-{GPU} training of {ConvNets}},
  journal = {arXiv preprint arXiv:1312.5853},
  year    = {2013},
}

% Tool name, proper nouns and acronyms are brace-protected against
% sentence-casing; journal name capitalised.
@article{simt,
  author    = {Liu, Yongchao and Schmidt, Bertil and Maskell, Douglas L},
  title     = {{CUDASW++} 2.0: enhanced {Smith-Waterman} protein database search on {CUDA}-enabled {GPUs} based on {SIMT} and virtualized {SIMD} abstractions},
  journal   = {BMC Research Notes},
  volume    = {3},
  number    = {1},
  pages     = {93},
  year      = {2010},
  publisher = {BioMed Central},
}

% The venue given is a periodical with volume and issue number, so the entry
% is typed as an article (an inproceedings entry carrying both volume and
% number makes standard styles warn and drop one of them).
@article{ddr4,
  author    = {Mukundan, Janani and Hunter, Hillery and Kim, Kyu-hyoun and Stuecheli, Jeffrey and Mart{\'\i}nez, Jos{\'e} F},
  title     = {Understanding and mitigating refresh overheads in high-density {DDR4} {DRAM} systems},
  journal   = {ACM SIGARCH Computer Architecture News},
  volume    = {41},
  number    = {3},
  pages     = {48--59},
  year      = {2013},
  publisher = {ACM},
}

% Blog post: typed as misc with the link in howpublished instead of an
% article whose title is the URL. Year and month are taken from the URL path.
% NOTE(review): title is a placeholder inferred from the URL slug; confirm.
@misc{url_intel_infer,
  title        = {{NVIDIA} Blog Post on {Intel} Inference and {NVIDIA} {GPUs}},
  howpublished = {\url{https://blogs.nvidia.com/blog/2019/05/21/intel-inference-nvidia-gpus/}},
  year         = {2019},
  month        = may,
}

% Web resource: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{vtune,
  title        = {{Intel} {VTune}},
  howpublished = {\url{https://software.intel.com/en-us/vtune}},
}

% Web resource: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{cold,
  title        = {{NVIDIA} {cuDNN} Developer Guide},
  howpublished = {\url{https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html}},
}

% Web resource: typed as misc with the link wrapped in \url, so the tilde in
% the address is printed literally instead of being typeset as a space.
% NOTE(review): title inferred from the citation key and URL; confirm.
@misc{cifar10,
  title        = {The {CIFAR-10} Dataset},
  howpublished = {\url{https://www.cs.toronto.edu/~kriz/cifar.html}},
}

% Web resource: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title inferred from the URL; confirm against the page.
@misc{aws,
  title        = {{Amazon Web Services}},
  howpublished = {\url{https://aws.amazon.com/}},
}


% Web resource: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{sherlock,
  title        = {{Sherlock Holmes} Stories ({Kaggle} dataset)},
  howpublished = {\url{https://www.kaggle.com/idevji1/sherlock-holmes-stories}},
}

% Code repository link: typed as misc with the link in howpublished instead
% of an article whose title is a bare URL.
% NOTE(review): title inferred from the repository name in the URL; confirm.
@misc{parax_anony,
  title        = {{ParaX} Source Code Repository},
  howpublished = {\url{https://github.com/anonymous-nicer/parax}},
}

% Placeholder reference used while the repository link is withheld. Typed as
% misc because it has no author, journal or year, which an article requires.
@misc{parax_anony2,
  title = {Link to repository removed for double blind review.},
}

% Web resource: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL. The original address had no scheme.
% NOTE(review): https scheme and title are inferred from the URL; confirm.
@misc{amazon,
  title        = {Deep Learning Containers {EKS} Tutorials: {CPU} Training},
  howpublished = {\url{https://docs.aws.amazon.com/dlami/latest/devguide/deep-learning-containers-eks-tutorials-cpu-training.html}},
}

% Web resource: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{8280,
  title        = {Performance Boosting in {Seldon}},
  howpublished = {\url{https://software.intel.com/en-us/articles/performance-boosting-in-seldon}},
}

% Product specification page: typed as misc with the link in howpublished
% instead of an article whose title is a bare URL.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{8280spec,
  title        = {{Intel} {Xeon} {Platinum} 8280 Processor (38.5M Cache, 2.70 {GHz})},
  howpublished = {\url{https://ark.intel.com/content/www/us/en/ark/products/192478/intel-xeon-platinum-8280-processor-38-5m-cache-2-70-ghz.html}},
}

% Web resource: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{cuda,
  title        = {About {CUDA}},
  howpublished = {\url{https://developer.nvidia.com/about-cuda}},
}

% Web resource: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title is a placeholder inferred from the URL slug; confirm.
@misc{tesla_flops,
  title        = {{NVIDIA} {Pascal} {GPU} at {GTC} 2016},
  howpublished = {\url{https://wccftech.com/nvidia-pascal-gpu-gtc-2016/}},
}

% Product page: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{tesla,
  title        = {{NVIDIA} {Tesla} {P100}},
  howpublished = {\url{https://www.nvidia.com/en-us/data-center/tesla-p100/}},
}

% The original body was a bare URL with no field name, which is a BibTeX
% syntax error. The link now lives in howpublished of a misc entry.
% NOTE(review): title inferred from the repository path; confirm.
@misc{djl,
  title        = {Deep Java Library ({DJL})},
  howpublished = {\url{https://github.com/deepjavalibrary/djl}},
}

% Web resource: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{avx512,
  title        = {{Intel} {AVX-512} Instructions},
  howpublished = {\url{https://software.intel.com/en-us/articles/intel-avx-512-instructions}},
}

% Dataset page: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{a670k,
  title        = {Extreme Classification -- {Amazon} ({Kaggle} competition data)},
  howpublished = {\url{https://www.kaggle.com/c/extreme-classification-amazon/data}},
}

% Web resource: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{intel9282,
  title        = {{Intel} {Xeon} {Platinum} 8280 vs. {Intel} {Xeon} {Platinum} 9282},
  howpublished = {\url{https://cpubase.com/compare/intel-xeon-platinum-8280-vs-intel-xeon-platinum-9282}},
}

% Product page: typed as misc with the link in howpublished instead of an
% article whose title is a bare URL.
% NOTE(review): title inferred from the URL path; confirm against the page.
@misc{v100,
  title        = {{NVIDIA} {V100}},
  howpublished = {\url{https://www.nvidia.com/en-us/data-center/v100/}},
}


% Fixes a parse error (no comma separated the venue and year fields).
% CoRR is a preprint repository, so the entry is an article with CoRR as the
% journal rather than a proceedings paper.
% NOTE(review): the arXiv identifier in volume should be double-checked.
@article{don,
  author  = {Lin, Tao and Stich, Sebastian U and Patel, Kumar Kshitij and Jaggi, Martin},
  title   = {Don't Use Large Mini-Batches, Use Local {SGD}},
  journal = {CoRR},
  volume  = {abs/1808.07217},
  year    = {2018},
}

% Fixes a parse error (no comma separated the venue and year fields).
% CoRR is a preprint repository, so the entry is an article with CoRR as the
% journal rather than a proceedings paper. The hyphen in the second author's
% given name is restored.
% NOTE(review): the arXiv identifier in volume should be double-checked.
@article{don2,
  author  = {Smith, Samuel L and Kindermans, Pieter-Jan and Ying, Chris and Le, Quoc V},
  title   = {Don't Decay the Learning Rate, Increase the Batch Size},
  journal = {CoRR},
  volume  = {abs/1711.00489},
  year    = {2017},
}

% Fixes a parse error (no comma separated the venue and year fields).
% CoRR is a preprint repository, so the entry is an article with CoRR as the
% journal rather than a proceedings paper. The proper adjective is protected.
% NOTE(review): the arXiv identifier in volume should be double-checked.
@article{don3,
  author  = {Smith, Samuel L and Le, Quoc V},
  title   = {A {Bayesian} Perspective on Generalization and Stochastic Gradient Descent},
  journal = {CoRR},
  volume  = {abs/1710.06451},
  year    = {2017},
}

% Library name written with its proper capitalisation and brace-protected.
@inproceedings{pytorch,
  author    = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others},
  title     = {{PyTorch}: An imperative style, high-performance deep learning library},
  booktitle = {Advances in neural information processing systems},
  pages     = {8026--8037},
  year      = {2019},
}

% Dataset name written with its proper capitalisation and brace-protected;
% the organisation acronym is upper-cased.
@inproceedings{imagenet,
  author       = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
  title        = {{ImageNet}: A large-scale hierarchical image database},
  booktitle    = {2009 IEEE conference on computer vision and pattern recognition},
  pages        = {248--255},
  year         = {2009},
  organization = {IEEE},
}

% Software repository reference. The product name is capitalised and
% brace-protected (it had been entered entirely in lower case).
@misc{mkldnn,
  title        = {{Intel(R)} Math Kernel Library for Deep Neural Networks ({Intel(R)} {MKL-DNN})},
  howpublished = {\url{https://github.com/intel/mkl-dnn}},
}

% The venue previously used math-mode escaped braces, which print literal
% curly braces around the acronyms. Plain protective braces are used instead,
% and the system name is written with its proper capitalisation.
@inproceedings{tf,
  author    = {Abadi, Mart{\'\i}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and others},
  title     = {{TensorFlow}: A system for large-scale machine learning},
  booktitle = {12th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 16)},
  pages     = {265--283},
  year      = {2016},
}

@inproceedings{caffe,
  author       = {Jia, Yangqing and
                  Shelhamer, Evan and
                  Donahue, Jeff and
                  Karayev, Sergey and
                  Long, Jonathan and
                  Girshick, Ross and
                  Guadarrama, Sergio and
                  Darrell, Trevor},
  title        = {Caffe: Convolutional architecture for fast feature embedding},
  booktitle    = {Proceedings of the 22nd ACM international conference on Multimedia},
  year         = {2014},
  pages        = {675--678},
  organization = {ACM},
}

@article{lstm,
  author    = {Hochreiter, Sepp and
               Schmidhuber, J{\"u}rgen},
  title     = {Long short-term memory},
  journal   = {Neural computation},
  year      = {1997},
  volume    = {9},
  number    = {8},
  pages     = {1735--1780},
  publisher = {MIT Press},
}

@article{cnn3,
  author  = {Simonyan, Karen and
             Zisserman, Andrew},
  title   = {Very deep convolutional networks for large-scale image recognition},
  journal = {arXiv preprint arXiv:1409.1556},
  year    = {2014},
}

% The first author was listed twice; the duplicate is removed. Publisher
% capitalisation now matches the other MIT Press entries in this file.
@book{nlp1,
  author    = {Manning, Christopher D and Sch{\"u}tze, Hinrich},
  title     = {Foundations of statistical natural language processing},
  year      = {1999},
  publisher = {MIT Press},
}

@inproceedings{nlp2,
  author       = {Graves, Alex and
                  Mohamed, Abdel-rahman and
                  Hinton, Geoffrey},
  title        = {Speech recognition with deep recurrent neural networks},
  booktitle    = {2013 IEEE international conference on acoustics, speech and signal processing},
  year         = {2013},
  pages        = {6645--6649},
  organization = {IEEE},
}

@inproceedings{nlp3,
  author    = {Mikolov, Tom{\'a}{\v{s}} and
               Karafi{\'a}t, Martin and
               Burget, Luk{\'a}{\v{s}} and
               {\v{C}}ernock{\`y}, Jan and
               Khudanpur, Sanjeev},
  title     = {Recurrent neural network based language model},
  booktitle = {Eleventh annual conference of the international speech communication association},
  year      = {2010},
}

@article{nlp4,
  author    = {Williams, Ronald J and
               Zipser, David},
  title     = {A learning algorithm for continually running fully recurrent neural networks},
  journal   = {Neural computation},
  year      = {1989},
  volume    = {1},
  number    = {2},
  pages     = {270--280},
  publisher = {MIT Press},
}

% "Computer" is the IEEE magazine, not a conference, so the entry is typed as
% an article with the venue in the journal field.
% NOTE(review): volume and issue number were added from memory; confirm.
@article{recommender2,
  author    = {Koren, Yehuda and Bell, Robert and Volinsky, Chris},
  title     = {Matrix factorization techniques for recommender systems},
  journal   = {Computer},
  volume    = {42},
  number    = {8},
  pages     = {30--37},
  year      = {2009},
  publisher = {IEEE},
}

@article{recommender1,
  author    = {Herlocker, Jonathan L and
               Konstan, Joseph A and
               Terveen, Loren G and
               Riedl, John T},
  title     = {Evaluating collaborative filtering recommender systems},
  journal   = {ACM Transactions on Information Systems (TOIS)},
  year      = {2004},
  volume    = {22},
  number    = {1},
  pages     = {5--53},
  publisher = {ACM},
}

% Brand name written with its proper capitalisation and brace-protected.
@inproceedings{recommender3,
  author       = {Covington, Paul and Adams, Jay and Sargin, Emre},
  title        = {Deep neural networks for {YouTube} recommendations},
  booktitle    = {Proceedings of the 10th ACM conference on recommender systems},
  pages        = {191--198},
  year         = {2016},
  organization = {ACM},
}

@inproceedings{ncf,
  author       = {He, Xiangnan and
                  Liao, Lizi and
                  Zhang, Hanwang and
                  Nie, Liqiang and
                  Hu, Xia and
                  Chua, Tat-Seng},
  title        = {Neural collaborative filtering},
  booktitle    = {Proceedings of the 26th international conference on world wide web},
  year         = {2017},
  pages        = {173--182},
  organization = {International World Wide Web Conferences Steering Committee},
}

% Library name written with its proper capitalisation and brace-protected.
@article{cudnn,
  author  = {Chetlur, Sharan and Woolley, Cliff and Vandermersch, Philippe and Cohen, Jonathan and Tran, John and Catanzaro, Bryan and Shelhamer, Evan},
  title   = {{cuDNN}: Efficient primitives for deep learning},
  journal = {arXiv preprint arXiv:1410.0759},
  year    = {2014},
}


@inproceedings{cnn2,
  author    = {Szegedy, Christian and
               Liu, Wei and
               Jia, Yangqing and
               Sermanet, Pierre and
               Reed, Scott and
               Anguelov, Dragomir and
               Erhan, Dumitru and
               Vanhoucke, Vincent and
               Rabinovich, Andrew},
  title     = {Going deeper with convolutions},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2015},
  pages     = {1--9},
}

% Dataset name written with its proper capitalisation and brace-protected.
@inproceedings{cnn1,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
  title     = {{ImageNet} classification with deep convolutional neural networks},
  booktitle = {Advances in neural information processing systems},
  pages     = {1097--1105},
  year      = {2012},
}


% arXiv preprint: typed as an article with the arXiv venue in the journal
% field, matching how the other arXiv preprints in this file are recorded
% (it was a proceedings entry whose booktitle was just "arXiv").
% NOTE(review): the arXiv identifier was added from memory; confirm.
@article{mkldnn1,
  author  = {Das, Dipankar and Avancha, Sasikanth and Mudigere, Dheevatsa and Vaidynathan, Karthikeyan and Sridharan, Srinivas and Kalamkar, Dhiraj and Kaul, Bharat and Dubey, Pradeep},
  title   = {Distributed deep learning using synchronous stochastic gradient descent},
  journal = {arXiv preprint arXiv:1602.06709},
  year    = {2016},
}


% Duplicate definition of citation key "cudnn" removed: the same reference is
% already defined earlier in this file, and a repeated key is a BibTeX error.


% Duplicate definition of citation key "simt" removed: the same reference is
% already defined earlier in this file, and a repeated key is a BibTeX error.

% arXiv preprint: typed as an article with the arXiv identifier in the journal
% field, matching the other arXiv preprints in this file (it was a proceedings
% entry, which renders as "In arXiv preprint ..."). Model name capitalised.
@article{howard2017mobilenets,
  author  = {Howard, Andrew G and Zhu, Menglong and Chen, Bo and Kalenichenko, Dmitry and Wang, Weijun and Weyand, Tobias and Andreetto, Marco and Adam, Hartwig},
  title   = {{MobileNets}: Efficient convolutional neural networks for mobile vision applications},
  journal = {arXiv preprint arXiv:1704.04861},
  year    = {2017},
}

% arXiv preprint: typed as an article with the arXiv identifier in the journal
% field, matching the other arXiv preprints in this file (it was a proceedings
% entry, which renders as "In arXiv preprint ..."). Acronym brace-protected.
@article{datap1,
  author  = {Chen, Jianmin and Pan, Xinghao and Monga, Rajat and Bengio, Samy and Jozefowicz, Rafal},
  title   = {Revisiting distributed synchronous {SGD}},
  journal = {arXiv preprint arXiv:1604.00981},
  year    = {2016},
}

% arXiv preprint: typed as an article with the arXiv identifier in the journal
% field, matching the other arXiv preprints in this file (it was a proceedings
% entry, which renders as "In arXiv preprint ...").
@article{bn,
  author  = {Ioffe, Sergey and Szegedy, Christian},
  title   = {Batch normalization: Accelerating deep network training by reducing internal covariate shift},
  journal = {arXiv preprint arXiv:1502.03167},
  year    = {2015},
}

% Page range uses the en-dash form (double hyphen); stray spaces in the
% author and number fields are removed; the journal name is spelled out
% with "and" instead of an escaped ampersand.
@article{rar,
  author  = {Patarasuk, Pitch and Yuan, Xin},
  title   = {Bandwidth optimal all-reduce algorithms for clusters of workstations},
  journal = {Journal of Parallel and Distributed Computing},
  volume  = {69},
  number  = {2},
  pages   = {117--124},
  year    = {2009},
}

% Removes the stray space before the colon in the title (it printed as
% "SLIDE : In Defense ...") and normalises field spacing.
@inproceedings{DBLP:slide,
  author    = {Beidi Chen and
               Tharun Medini and
               James Farwell and
               Sameh Gobriel and
               Charlie Tai and
               Anshumali Shrivastava},
  title     = {{SLIDE}: In Defense of Smart Algorithms over Hardware Acceleration
               for Large-Scale Deep Learning Systems},
  booktitle = {MLSys},
  pages     = {1--16},
  year      = {2020},
}

% System name and acronyms are brace-protected against sentence-casing;
% stray spaces inside the author list are removed.
@inproceedings{2018DeepCPU,
  author    = {Zhang, Minjia and Rajbhandari, Samyam and Wang, Wenhan and He, Yuxiong},
  title     = {{DeepCPU}: Serving {RNN}-based Deep Learning Models 10x Faster},
  booktitle = {{USENIX} Annual Technical Conference},
  year      = {2018},
}